
# import tensorflow_datasets as tfds
import tensorflow as tf
import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
import numpy as np
from matplotlib import gridspec
# # import skimage.data as skid
import cv2
from glob import glob
# from sklearn.model_selection import train_test_split
# import random
# # from sklearn.cluster import KMeans
# from sklearn.neighbors import NearestNeighbors
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
# Dataset constants: MNIST images are 28x28 grayscale.
IMG_SIZE = 28
VECTOR_SIZE = IMG_SIZE * IMG_SIZE

# Load MNIST and flatten each image into a 784-dim row vector.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train_flat = x_train.reshape(x_train.shape[0], x_train.shape[1] * x_train.shape[2])
x_test_flat = x_test.reshape(x_test.shape[0], x_test.shape[1] * x_test.shape[2])

# Count how many training samples each digit label (0-9) has.
d = {k: 0 for k in range(10)}
for k in y_train:
    d[k] += 1
d  # bare expression (notebook residue) — a no-op in a plain script
{0: 5923,
1: 6742,
2: 5958,
3: 6131,
4: 5842,
5: 5421,
6: 5918,
7: 6265,
8: 5851,
9: 5949}
import plotly.express as px  # re-import kept from the original cell

# Bar chart of the per-digit sample counts computed above.
fig = px.bar(x=list(d.keys()), y=list(d.values()))
fig.show()
def show_images_in_sub_plots(image_list, nrows, ncols, parameters_for_printing=None, add_text=None):
    """Display ``nrows * ncols`` images from ``image_list`` in a subplot grid.

    Parameters
    ----------
    image_list : sequence of arrays
        Images to show; 2-D arrays are drawn with a gray colormap,
        deeper arrays (e.g. RGB) are drawn as-is.
    nrows, ncols : int
        Requested grid shape.
    parameters_for_printing : sequence, optional
        Per-image values shown in each subplot title.
    add_text : str, optional
        Prefix prepended to every title as ``"<add_text>: "``.
    """
    # Create one axes per image; the gridspec below rearranges the flat
    # row of axes into the requested nrows x ncols grid.
    fig, axes = plt.subplots(nrows * ncols, figsize=(17, 10))
    # Identity check (`is None`) instead of the original `== None`.
    add_text = "" if add_text is None else add_text + ": "
    for idx in range(nrows * ncols):
        # The original kept a separate counter `j` that always equaled the
        # loop variable; one index suffices.
        img = image_list[idx]
        # Gray-scale images have 2 dims; anything deeper is treated as color.
        if len(img.shape) > 2:
            axes[idx].imshow(img)
        else:
            axes[idx].imshow(img, cmap='gray')
        if parameters_for_printing is not None:
            axes[idx].set_title(f"{add_text}{parameters_for_printing[idx]}")
    # Reposition each axes according to the requested grid shape.
    gs = gridspec.GridSpec(nrows, ncols)
    for i, axis in enumerate(fig.axes):  # avoid shadowing `axes`
        axis.set_position(gs[i].get_position(fig))
    plt.show()
# Preview the first 12 training digits with their labels.
show_images_in_sub_plots(x_train[:12], 3, 4, y_train[:12], "This Number is")

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Fit a KNN classifier for each K in 1..10 on the flattened training set.
# Reuse the precomputed flat views instead of reshaping inside every call.
K_arr = np.arange(1, 11)
models = [KNeighborsClassifier(n_neighbors=k, n_jobs=-1).fit(x_train_flat, y_train)
          for k in K_arr]
# accuracy_score(y_true, y_pred): ground truth goes first.
predictions = np.array([accuracy_score(y_test, model.predict(x_test_flat))
                        for model in models])

# Give the best K a distinct color value so it stands out in the chart.
color_discrete_sequence = [100] * len(predictions)
color_discrete_sequence[np.argmax(predictions)] = 10
# Bug fix: accuracy is a fraction in [0, 1]; scale to percent so the "%"
# in the message is truthful (the original printed "0.9705%").
print(f"The Best K is:{np.argmax(predictions) + 1} with {np.max(predictions) * 100}% accuracy")
fig = px.bar(x=np.arange(1, 11), y=predictions * 100, color=color_discrete_sequence,
             text_auto=True, labels={'x': 'K', 'y': 'Accuracy Score'},
             title="The results of KNN")
fig.update_coloraxes(showscale=False)
The Best K is:3 with 0.9705% accuracy
When classifying into more than two classes, or when k is even, the nearest-neighbor vote can end in a tie that must be broken. Two common options are 'random', which picks one of the tied classes at random, and 'nearest', which assigns the class of the single nearest neighbor among the tied classes.
from sklearn.decomposition import PCA

# Flatten the training images and fit a 6-component PCA.
n_samples = x_train.shape[0]
x_train_flat = x_train.reshape((n_samples, x_train.shape[1] * x_train.shape[2]))
pca = PCA(6)
pca.fit(x_train_flat)
# The per-pixel mean PCA subtracts before projecting, viewed as an image.
plt.imshow(pca.mean_.reshape((28, 28)), cmap='gray')
# Render each of the 6 principal components as a 28x28 image.
show_images_in_sub_plots(pca.components_.reshape(6, 28, 28), 1, 6, np.arange(1, 7), "component ")
plt.show()
# Scree plot: explained variance per component.
px.line(pca.explained_variance_)
# Find how many components are needed to retain 95% and then 80% of the
# total variance (PCA accepts a float in (0, 1) as a variance target).
fraction_list = [0.95, 0.8]
print("RESULTS:")
for fraction in fraction_list:
    pca = PCA(fraction)
    # Reuse the flattened training matrix instead of reshaping with
    # hard-coded dimensions (60000, 28*28).
    pca.fit(x_train_flat)
    percentage = fraction * 100
    # f-string instead of the original chained str() concatenation.
    print(f"To get {percentage}% variance, I need {pca.n_components_} components")
RESULTS: To get 95.0% variance, I need 154 components To get 80.0% variance, I need 44 components
# Project the training set onto its first two principal components and
# scatter-plot the projection colored by digit label.
pca = PCA(n_components=2)
X_2d = pca.fit_transform(x_train_flat)
px.scatter(x=X_2d[:, 0], y=X_2d[:, 1], color=y_train, width=800, height=600)
def KNN_fit_and_score(k_min, k_max, x, y, x_test, y_test):
    """Fit a KNN classifier for every k in [k_min, k_max] and score each one.

    Parameters
    ----------
    k_min, k_max : int
        Inclusive range of neighbor counts to try.
    x, y : array-like
        Training features and labels.
    x_test, y_test : array-like
        Evaluation features and labels.

    Returns
    -------
    dict
        Mapping of k -> mean accuracy on (x_test, y_test).
    """
    # Build the dict as we go; the original pre-filled every key with 0
    # and then overwrote each entry, which was redundant.
    scores = {}
    for k in range(k_min, k_max + 1):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x, y)
        score = knn.score(x_test, y_test)
        print(f'k={k} : {score}')
        scores[k] = score
    return scores
# Evaluate KNN for k in 1..10 on the raw flattened pixel vectors.
scores = KNN_fit_and_score(1, 10, x_train_flat, y_train,
                           x_test_flat, y_test)
print(scores)
# displays the figure
k=1 : 0.9691
k=2 : 0.9627
k=3 : 0.9705
k=4 : 0.9682
k=5 : 0.9688
k=6 : 0.9677
k=7 : 0.9694
k=8 : 0.967
k=9 : 0.9659
k=10 : 0.9665
{1: 0.9691, 2: 0.9627, 3: 0.9705, 4: 0.9682, 5: 0.9688, 6: 0.9677, 7: 0.9694, 8: 0.967, 9: 0.9659, 10: 0.9665}
def getScoreOfKnn(currentK, xTrain, yTrain, xTest, yTest):
    """Train a KNN classifier with ``currentK`` neighbors and return its
    accuracy on the test set as a percentage in [0, 100].

    Prints progress messages before/after fitting and after predicting.
    """
    # f-strings replace the original str() + concatenation chains.
    print(f'computes score of KNN (K is: {currentK}):')
    print('start the training and fitting')
    knn = KNeighborsClassifier(n_neighbors=currentK)
    knn.fit(xTrain, yTrain)
    print('terminates the training, and now begins the predicting')
    knnPredicts = knn.predict(xTest)
    # accuracy_score(y_true, y_pred): ground truth first (accuracy is
    # symmetric, but the documented order avoids confusion).
    score = accuracy_score(yTest, knnPredicts) * 100
    print(f'the prediction accuracy score for K={currentK} is: {score}%')
    return score
from matplotlib.widgets import Slider
from sklearn.preprocessing import StandardScaler
def show_all(image_list, title_list=None):
    """Show every image in ``image_list`` side by side in one row.

    Parameters
    ----------
    image_list : sequence of arrays
        2-D arrays are drawn gray-scale; 3-D arrays have their first and
        last channels swapped (cv2 BGR <-> RGB) before display.
    title_list : sequence of str, optional
        One title per image; defaults to empty titles.

    Raises
    ------
    AssertionError
        If ``title_list`` is given and its length differs from ``image_list``.
    """
    if title_list is None:
        title_list = [''] * len(image_list)
    # Validate BEFORE allocating the figure so a bad call leaks nothing
    # (the original created the figure first, then asserted).
    assert len(image_list) == len(title_list), "List sizes should be equal"
    plt.figure(figsize=[20, 20])
    N = len(image_list)
    for index, (img, title) in enumerate(zip(image_list, title_list)):
        plt.subplot(1, N, index + 1)
        plt.title(title)
        if len(img.shape) != 3:
            plt.imshow(img, cmap='gray')
        else:
            # Swapping channels 0 and 2 is its own inverse, so RGB2BGR
            # behaves identically to the conventional BGR2RGB here.
            plt.imshow(cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
    plt.show()
def plot(rows, cols, x, y):
    """Plot up to ``rows * cols`` images from ``x`` titled by labels in ``y``.

    Cells beyond ``min(len(x), len(y))`` are left empty.
    """
    # squeeze=False guarantees a 2-D ndarray of axes even for a 1x1 grid,
    # so .flatten() is always valid (the original crashed for rows=cols=1
    # because plt.subplots then returns a bare Axes, not an array).
    fig, ax = plt.subplots(rows, cols, squeeze=False)
    ax = ax.flatten()
    for i in range(rows * cols):
        if i < len(x) and i < len(y):
            ax[i].set_title(f'label: {y[i]}', fontsize='small', loc='left')
            ax[i].imshow(x[i], cmap='gray_r')
# Reload MNIST so the arrays below start from a clean copy.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
# NOTE(review): the "validation" set is an alias of the training set —
# presumably a train/validation split was intended; confirm upstream.
# (The original also contained a no-op self-assignment of x_train/y_train.)
x_val, y_val = x_train, y_train
# Show the first dozen digits with their labels.
plot(3, 4, x_train[:12], y_train[:12])
def euclidian_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``."""
    diff = x1 - x2
    return np.sqrt(np.sum(np.square(diff)))
# Using sklearn for faster computation
from sklearn.neighbors import KNeighborsClassifier

# Repeat the k sweep over the same inclusive range 1..10.
K_START, K_END = 1, 10
scores = KNN_fit_and_score(K_START, K_END, x_train_flat, y_train,
                           x_test_flat, y_test)
print(scores)
k=1 : 0.9691
k=2 : 0.9627
k=3 : 0.9705
k=4 : 0.9682
k=5 : 0.9688
k=6 : 0.9677
k=7 : 0.9694
k=8 : 0.967
k=9 : 0.9659
k=10 : 0.9665
{1: 0.9691, 2: 0.9627, 3: 0.9705, 4: 0.9682, 5: 0.9688, 6: 0.9677, 7: 0.9694, 8: 0.967, 9: 0.9659, 10: 0.9665}
# Accuracy as a function of k.
# NOTE(review): the printed scores above peak at k=3 (0.9705), although the
# original comment claimed k=4 performed best — verify before relying on it.
ks, accs = zip(*scores.items())
plt.plot(ks, accs)
[<matplotlib.lines.Line2D at 0x1b82afcb8b0>]
# PCA
#Import required modules
from sklearn.decomposition import PCA
import plotly.express as px
# Number of principal components visualized in the scatter matrices below.
NUM_PRINCIPAL_COMPONENTS = 6

def get_pca_components(n_components, x):
    """Fit PCA(n_components) on ``x``; return the model and the projection."""
    model = PCA(n_components=n_components)
    projected = model.fit_transform(x)
    return model, projected
def generate_and_plot_pc(num_components, x, y):
    """Fit PCA(num_components) on ``x``, draw a scatter matrix of the
    resulting components colored by ``y``, and return (pca, components).
    """
    pca, components = get_pca_components(num_components, x)
    # Axis labels include the % of variance each component explains.
    labels = {
        str(i): f"PC {i+1} ({var:.1f}%)"
        for i, var in enumerate(pca.explained_variance_ratio_ * 100)
    }
    fig = px.scatter_matrix(
        components,
        labels=labels,
        # Bug fix: use the function's own argument rather than the global
        # NUM_PRINCIPAL_COMPONENTS, so other component counts work too.
        dimensions=range(num_components),
        color=y, height=800, width=1000)
    fig.update_traces(diagonal_visible=False)
    fig.show()
    return pca, components
# Visualize all the principal components: apply PCA to the training set and
# plot every pair of resulting components in a scatter matrix, ordered by
# how much variance each one explains (px.scatter_matrix trace).
generate_and_plot_pc(NUM_PRINCIPAL_COMPONENTS, x_train_flat, y_train)
(PCA(n_components=6),
array([[ 123.93266301, -312.67489625, -24.5123551 , -555.75630373,
-27.32383823, -232.47153695],
[1011.71836932, -294.85716425, 596.34142013, -460.76225557,
-827.26856776, -181.95232355],
[ -51.84948811, 392.17299968, -188.50056632, 521.01557168,
-306.64797556, -1.80762971],
...,
[-178.05347838, 160.07796218, -257.61487418, -714.64924171,
-210.29056715, -462.45973886],
[ 130.60620829, -5.59341744, 513.86347487, 342.69579499,
-521.09159461, -540.50665489],
[-173.43596174, -24.7192802 , 556.02400817, -120.85671067,
-198.0292573 , 232.99762183]]))
def get_total_var_explained(num_components, x, y):
    """Return the total % of variance captured by PCA(num_components) on ``x``.

    ``y`` is accepted for signature compatibility with the call sites but
    is not used.
    """
    pca, _ = get_pca_components(num_components, x)  # projection discarded
    return pca.explained_variance_ratio_.sum() * 100
# Evaluate the cumulative variance for 10, 20, ..., 200 components.
n_components_list = list(range(10, 201, 10))
var_explained_list = [get_total_var_explained(n, x_train_flat, y_train)
                      for n in n_components_list]
# Total Variance explained by number of components
# As we can see, it takes 50 components for 82% variance and 160 components for 95% variance
plt.plot(n_components_list, var_explained_list)
plt.title('Total Variance explained by number of components')
plt.xlabel('n')
plt.ylabel('Total Variance %')
Text(0, 0.5, 'Total Variance %')
# Question one with 2, 10, 20 components
def KNN_and_PCA(num_components, x, y, x_val, y_val):
    """Reduce both sets to ``num_components`` PCA dimensions, then sweep
    KNN over k = 1..10 and return the {k: accuracy} dict.

    Bug fix: the validation data must be projected with the PCA fitted on
    the TRAINING data. The original fitted a second, independent PCA on
    ``x_val``, producing incompatible coordinate systems — which is why the
    recorded accuracies below the original call hover around 11%.
    """
    pca, components = get_pca_components(num_components, x)
    components_val = pca.transform(x_val)
    K_START, K_END = 1, 10
    return KNN_fit_and_score(K_START, K_END, components, y, components_val, y_val)
# Repeat the KNN sweep after reducing to 2 PCA components.
scores = KNN_and_PCA(2, x_train_flat, y_train,
                     x_test_flat, y_test)
print(scores)
k=1 : 0.1071
k=2 : 0.1117
k=3 : 0.1121
k=4 : 0.115
k=5 : 0.1164
k=6 : 0.1158
k=7 : 0.1148
k=8 : 0.1169
k=9 : 0.1186
k=10 : 0.1185
{1: 0.1071, 2: 0.1117, 3: 0.1121, 4: 0.115, 5: 0.1164, 6: 0.1158, 7: 0.1148, 8: 0.1169, 9: 0.1186, 10: 0.1185}
# Reconstruct the first test image from progressively larger PCA bases.
k_values = [2, 5, 10, 50, 100, 150]
NUM_EXAMPLES = 10000
imgs = []
for k in k_values:
    pca = PCA(k)
    transformed = pca.fit_transform(x_test_flat)
    reconstructed = pca.inverse_transform(transformed)
    # Infer the sample count with -1 instead of hard-coding NUM_EXAMPLES,
    # so this keeps working if the test split changes size.
    reconstructed = np.reshape(reconstructed, (-1, 28, 28))
    imgs.append(reconstructed[0])
labels = [f'Reconstruction - {k}' for k in k_values]
# Group the flattened training images by digit label. The per-digit counts
# differ, so the outer container is ragged: build an object array explicitly
# (modern NumPy raises on implicit ragged np.array(...) construction).
x_train_by_digit = np.array([x_train_flat[y_train == i] for i in range(10)],
                            dtype=object)
y_train_by_digit = []
for i in range(10):
    print(f'Label:{i} shape:', x_train_by_digit[i].shape)
    # Constant label vector for this digit; float dtype preserved from the
    # original's `np.ones(n) * [i]` expression.
    y_train_by_digit.append(np.full(x_train_by_digit[i].shape[0], i, dtype=float))
y_train_by_digit = np.array(y_train_by_digit, dtype=object)
Label:0 shape: (5923, 784) Label:1 shape: (6742, 784) Label:2 shape: (5958, 784) Label:3 shape: (6131, 784) Label:4 shape: (5842, 784) Label:5 shape: (5421, 784) Label:6 shape: (5918, 784) Label:7 shape: (6265, 784) Label:8 shape: (5851, 784) Label:9 shape: (5949, 784)
def get_models_by_digit(n, x, y):
    """Fit one PCA(n) per digit on ``x[i]`` and plot its scatter matrix.

    Parameters
    ----------
    n : int
        Number of principal components per digit model.
    x, y : indexable of 10 arrays
        Per-digit feature matrices and label vectors.

    Returns
    -------
    (list, list)
        The fitted PCA models and their projected components, index = digit.

    Bug fix: use the ``x``/``y`` parameters instead of silently reading the
    globals ``x_train_by_digit``/``y_train_by_digit``. The original call
    site passed those same globals, so results are unchanged.
    """
    pca_list = []
    component_list = []
    for i in range(10):
        pca_i, components_i = generate_and_plot_pc(n, x[i], y[i])
        pca_list.append(pca_i)
        component_list.append(components_i)
    return pca_list, component_list
# NOTE(review): this redefinition shadows the earlier generate_and_plot_pc
# (which additionally set height/width); consider keeping only one of them.
def generate_and_plot_pc(num_components, x, y):
    """Fit PCA(num_components) on ``x``, plot the component scatter matrix
    colored by ``y``, and return (pca, components).
    """
    pca, components = get_pca_components(num_components, x)
    # Axis labels include the % of variance each component explains.
    labels = {
        str(i): f"PC {i+1} ({var:.1f}%)"
        for i, var in enumerate(pca.explained_variance_ratio_ * 100)
    }
    fig = px.scatter_matrix(
        components,
        labels=labels,
        # Bug fix: respect the num_components argument rather than the
        # global NUM_PRINCIPAL_COMPONENTS.
        dimensions=range(num_components),
        color=y)
    fig.update_traces(diagonal_visible=False)
    fig.show()
    return pca, components
# NOTE(review): behaviorally identical duplicate of the earlier
# get_pca_components; the redefinition shadows it.
def get_pca_components(n_components, x):
    """Fit PCA(n_components) on ``x``; return the model and the projection."""
    model = PCA(n_components=n_components)
    return model, model.fit_transform(x)
# Fit one 6-component PCA per digit and plot each digit's scatter matrix.
NUM_PRINCIPAL_COMPONENTS = 6
pca_list, component_list = get_models_by_digit(
    NUM_PRINCIPAL_COMPONENTS, x_train_by_digit, y_train_by_digit)
def get_reconstructions_by_model(pca_list, k, x_test):
    """Reconstruct the first test image with each per-digit PCA model.

    Parameters
    ----------
    pca_list : list
        Fitted PCA models, one per digit.
    k : int
        Unused; kept for signature compatibility with the call site.
    x_test : 2-D array
        Flattened test images (rows of 784 pixels).

    Returns
    -------
    imgs : ndarray of shape (len(pca_list), 28, 28)
        The first test image as reconstructed by each model.
    orig_imgs : ndarray of shape (len(pca_list), 28, 28)
        Identical copies of the original first test image, one per model
        (quirk preserved from the original implementation).
    """
    recon_first = []
    orig_imgs = []
    for pca in pca_list:
        transformed = pca.transform(x_test)
        reconstructed = pca.inverse_transform(transformed)
        reconstructed = np.reshape(reconstructed, (reconstructed.shape[0], 28, 28))
        orig_imgs.append(np.reshape(x_test[0], (28, 28)))
        recon_first.append(reconstructed[0])
    # Generalized from the original hard-coded (10, 28, 28) reshape to the
    # actual number of models supplied.
    imgs = np.array(recon_first).reshape((len(pca_list), 28, 28))
    return imgs, np.array(orig_imgs)
# Reconstruct the first test image under every per-digit PCA model.
k = NUM_PRINCIPAL_COMPONENTS
NUM_EXAMPLES = 10000
imgs, orig_imgs = get_reconstructions_by_model(pca_list, k, x_test_flat)

# Project the whole test set through each digit's PCA model.
test_transformed_by_digit = np.array(
    [model.transform(x_test_flat) for model in pca_list])

labels = [f'Reconstruction - {k}' for k in range(10)]
labels2 = ['Orig'] * 10
def calc_dist_from_orig(reconstructed_imgs, orig_img):
    """Return the index of the reconstruction closest (L2) to ``orig_img``.

    Also prints the full distance list, matching the original output.
    The winning index doubles as the predicted digit when the inputs come
    from the per-digit PCA models above.
    """
    # Comprehension instead of the original append loop.
    distances = [euclidian_distance(re_img, orig_img)
                 for re_img in reconstructed_imgs]
    print(f'Distances: {distances}')
    return np.argmin(np.array(distances))
# Classify the first test image by its nearest per-digit reconstruction.
match_label = calc_dist_from_orig(imgs, orig_imgs[0])
print(match_label)
Distances: [1658.0270032938195, 1830.9341052333757, 1512.0955422743584, 1539.186723435332, 1649.5781586694345, 1711.221499695632, 1767.0260078832487, 838.5562250463514, 1771.1498169956656, 1413.3859782409722] 7